In [1]:
#%%
#os.chdir(r'F:\Github\6103-Final-Project')
In [2]:
import pandas as pd
import numpy as np
import pylab as py
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
def dfChkBasics(dframe, valCnt = False): 
  cnt = 1
  print('\ndataframe Basic Check function -')
  
  try:
    print(f'\n{cnt}: info(): ')
    cnt+=1
    print(dframe.info())
  except: pass

  print(f'\n{cnt}: describe(): ')
  cnt+=1
  print(dframe.describe())

  print(f'\n{cnt}: head() -- ')
  cnt+=1
  print(dframe.head())

  print(f'\n{cnt}: shape: ')
  cnt+=1
  print(dframe.shape)

  if (valCnt):
    print('\nValue Counts for each feature -')
    for colname in dframe.columns :
      print(f'\n{cnt}: {colname} value_counts(): ')
      print(dframe[colname].value_counts())
      cnt +=1
In [4]:
# Dataset
# The data in this project contains 31 variables and more than 200,000 used cars' data. 
# 15 variables are anonymous we will drop them later. 
# 150,000 observations will be train set and 50,000 observation will be test set.
In [5]:
df=pd.read_csv("used_car_train_20200313.csv")
# Origin dataset is too large, so we only research on sample have size 20000.
df=df.sample(n=20000,random_state=2020)
In [6]:
# Read Data
#
# Drop meaningless variables
df=df.iloc[:,:16]
df=df.drop(columns='name')
In [7]:
# Check Data
dfChkBasics(df, valCnt= True)
dataframe Basic Check function -

1: info(): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 143572 to 138606
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             20000 non-null  int64  
 1   regDate            20000 non-null  int64  
 2   model              20000 non-null  int64  
 3   brand              20000 non-null  int64  
 4   bodyType           20000 non-null  int64  
 5   fuelType           20000 non-null  float64
 6   gearbox            20000 non-null  object 
 7   power              20000 non-null  object 
 8   kilometer          20000 non-null  object 
 9   notRepairedDamage  20000 non-null  object 
 10  regionCode         20000 non-null  int64  
 11  seller             20000 non-null  int64  
 12  offerType          20000 non-null  float64
 13  creatDate          20000 non-null  float64
 14  price              20000 non-null  float64
dtypes: float64(4), int64(7), object(4)
memory usage: 2.4+ MB
None

2: describe(): 
              SaleID       regDate         model         brand      bodyType  \
count   20000.000000  2.000000e+04  20000.000000  20000.000000  20000.000000   
mean    75513.315150  2.003381e+07     47.727300      8.094150      1.846000   
std     43180.721509  5.365699e+04     49.676568      7.848335      3.763083   
min         3.000000  1.991000e+07      0.000000      0.000000      0.000000   
25%     38595.750000  1.999091e+07     11.000000      1.000000      0.000000   
50%     75645.000000  2.003090e+07     30.000000      6.000000      1.000000   
75%    113090.750000  2.007110e+07     66.000000     13.000000      3.000000   
max    149996.000000  2.015121e+07    247.000000     39.000000    156.000000   

           fuelType    regionCode        seller     offerType     creatDate  \
count  20000.000000  2.000000e+04  2.000000e+04  2.000000e+04  2.000000e+04   
mean       1.390075  2.150097e+05  2.812609e+05  1.457643e+06  1.820903e+07   
std       13.485880  2.059593e+06  2.364529e+06  5.221320e+06  5.960597e+06   
min        0.000000  0.000000e+00  0.000000e+00  0.000000e+00 -4.169894e+00   
25%        0.000000  7.200000e+02  0.000000e+00  0.000000e+00  2.016031e+07   
50%        0.000000  1.992000e+03  0.000000e+00  0.000000e+00  2.016032e+07   
75%        1.000000  3.672000e+03  0.000000e+00  0.000000e+00  2.016033e+07   
max     1103.000000  2.016041e+07  2.016040e+07  2.016041e+07  2.016041e+07   

              price  
count  20000.000000  
mean    5643.327486  
std     7507.319472  
min       -3.902379  
25%      999.000000  
50%     2950.000000  
75%     7490.000000  
max    99900.000000  

3: head() -- 
        SaleID   regDate  model  brand  bodyType  fuelType gearbox power  \
143572  143572  20030007    180     13         3       0.0     128    15   
82758    82758  20080207     64     21         1       2.0       0    67   
3479      3479  20040611     13      4         0       1.0       1   218   
89329    89329  20070606     26     14         4       0.0       0   140   
90675    90675  20020112     88     14         1       0.0       0    74   

       kilometer notRepairedDamage  regionCode  seller   offerType  \
143572         1              6550           0       0  20160312.0   
82758         15                 0        1696       0         0.0   
3479          15                 0        1844       0         0.0   
89329          9                 0        6864       0         0.0   
90675         15                 0        3545       0         0.0   

         creatDate        price  
143572      1000.0    43.102277  
82758   20160312.0  1800.000000  
3479    20160328.0  6700.000000  
89329   20160403.0  6500.000000  
90675   20160404.0  1990.000000  

4: shape: 
(20000, 15)

Value Counts for each feature -

5: SaleID value_counts(): 
67583     1
45619     1
78823     1
79767     1
121390    1
         ..
11583     1
54592     1
89409     1
35309     1
65536     1
Name: SaleID, Length: 20000, dtype: int64

6: regDate value_counts(): 
20000001    29
20000008    27
20000002    26
20000010    24
20000004    24
            ..
20130911     1
19920201     1
19941011     1
19920507     1
20070205     1
Name: regDate, Length: 3558, dtype: int64

7: model value_counts(): 
0      1568
19     1250
4      1104
1       804
29      690
       ... 
216       1
235       1
236       1
237       1
231       1
Name: model, Length: 242, dtype: int64

8: brand value_counts(): 
0     4172
4     2227
14    2202
10    1922
1     1820
6     1357
9      965
5      586
13     498
11     384
3      336
16     304
7      298
8      293
25     283
27     272
21     193
19     187
15     187
20     174
22     163
12     156
26     128
17     126
30     125
24     111
28      95
32      83
29      52
2       49
31      42
18      38
37      38
34      30
33      30
36      26
23      22
35      19
38       6
39       1
Name: brand, dtype: int64

9: bodyType value_counts(): 
0      6076
1      4776
2      4021
3      1757
4      1228
5      1067
6       884
7       169
60        4
150       3
55        1
71        1
156       1
56        1
101       1
136       1
100       1
68        1
89        1
90        1
75        1
65        1
140       1
80        1
125       1
Name: bodyType, dtype: int64

10: fuelType value_counts(): 
0.0      12928
1.0       6356
2.0        296
15.0       117
0.5         44
         ...  
131.0        1
68.0         1
155.0        1
65.0         1
87.0         1
Name: fuelType, Length: 79, dtype: int64

11: gearbox value_counts(): 
0      14306
1       4242
15       197
-        197
75        90
       ...  
286        1
171        1
108        1
72         1
149        1
Name: gearbox, Length: 158, dtype: int64

12: power value_counts(): 
75      1173
0       1114
15      1067
150      822
140      764
        ... 
2773       1
2392       1
2283       1
487        1
2145       1
Name: power, Length: 511, dtype: int64

13: kilometer value_counts(): 
15      11468
12.5     1920
0         904
10        777
9         697
        ...  
2834        1
1102        1
6272        1
3027        1
559         1
Name: kilometer, Length: 280, dtype: int64

14: notRepairedDamage value_counts(): 
0       14454
-        2455
1        1645
72          9
1530        7
        ...  
4398        1
2133        1
3427        1
930         1
6263        1
Name: notRepairedDamage, Length: 1170, dtype: int64

15: regionCode value_counts(): 
0       1729
419       45
764       32
125       25
450       22
        ... 
2150       1
2134       1
3887       1
4149       1
5925       1
Name: regionCode, Length: 5482, dtype: int64

16: seller value_counts(): 
0           19510
20160310       17
20160331       16
20160401       14
20160314       12
            ...  
5800            1
2100            1
75              1
699             1
1999            1
Name: seller, Length: 134, dtype: int64

17: offerType value_counts(): 
0.000000e+00    18064
2.016031e+07       66
2.016032e+07       63
2.016031e+07       59
2.016032e+07       59
                ...  
9.700000e+03        1
5.600000e+03        1
7.990000e+03        1
3.545805e+01        1
6.399000e+03        1
Name: offerType, Length: 380, dtype: int64

18: creatDate value_counts(): 
 2.016040e+07    724
 2.016031e+07    683
 2.016040e+07    659
 2.016033e+07    658
 2.016032e+07    646
                ... 
 9.490000e+02      1
-3.460437e+00      1
-3.874060e+00      1
 4.667299e+01      1
 1.753440e+00      1
Name: creatDate, Length: 893, dtype: int64

19: price value_counts(): 
 500.000000      284
 1500.000000     249
 2500.000000     230
 1000.000000     205
 3500.000000     203
                ... 
 1720.000000       1
 755.000000        1
 9870.000000       1
 10150.000000      1
-3.239698          1
Name: price, Length: 3489, dtype: int64
In [8]:
# Q1 Car Attribute
# Research the influence of a vehicle’s 
# Brand, Body Type, Fuel Type and Gearbox Type
# on the value of the vehicle.
In [9]:
# Q2 Car Damage
# Research the influence of a vehicle’s 
# Milage, Engine power, whether the car has been Damaged
# on the value of the vehicle.
In [10]:
# Q2.1 EDA

# Q2.1.1 EDA

# Select col and drop NA
dfQ2 = df.iloc[:,[0,7,8,9,14]]
dfQ2.set_index('SaleID')
dfQ2.replace('-',np.nan,inplace=True)
dfQ2=dfQ2.dropna()

dfChkBasics(dfQ2, valCnt= True)
dataframe Basic Check function -

1: info(): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16836 entries, 143572 to 138606
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             16836 non-null  int64  
 1   power              16836 non-null  object 
 2   kilometer          16836 non-null  object 
 3   notRepairedDamage  16836 non-null  object 
 4   price              16836 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 789.2+ KB
None

2: describe(): 
              SaleID         price
count   16836.000000  16836.000000
mean    75636.829591   6172.197967
std     43181.086379   7823.240043
min         3.000000     -3.902379
25%     38755.500000   1250.000000
50%     75706.500000   3490.000000
75%    113221.750000   8076.750000
max    149996.000000  99900.000000

3: head() -- 
        SaleID power kilometer notRepairedDamage        price
143572  143572    15         1              6550    43.102277
82758    82758    67        15                 0  1800.000000
3479      3479   218        15                 0  6700.000000
89329    89329   140         9                 0  6500.000000
90675    90675    74        15                 0  1990.000000

4: shape: 
(16836, 5)

Value Counts for each feature -

5: SaleID value_counts(): 
67583     1
76352     1
64058     1
59960     1
98259     1
         ..
75102     1
15709     1
13660     1
144576    1
65536     1
Name: SaleID, Length: 16836, dtype: int64

6: power value_counts(): 
75      988
150     711
0       707
140     683
15      644
       ... 
3090      1
952       1
941       1
1861      1
7078      1
Name: power, Length: 494, dtype: int64

7: kilometer value_counts(): 
15      9633
12.5    1681
0        904
10       705
9        641
        ... 
6235       1
4152       1
1748       1
3544       1
6057       1
Name: kilometer, Length: 140, dtype: int64

8: notRepairedDamage value_counts(): 
0       14303
1        1645
2384        5
771         4
2012        4
        ...  
58          1
393         1
3660        1
4771        1
6263        1
Name: notRepairedDamage, Length: 776, dtype: int64

9: price value_counts(): 
500.000000      232
1500.000000     211
2500.000000     186
3500.000000     167
1200.000000     159
               ... 
36450.000000      1
18276.000000      1
44.841618         1
44.103896         1
12799.000000      1
Name: price, Length: 2726, dtype: int64
In [11]:
# Change the data type
dfQ2["power"] = dfQ2["power"].astype('float')
dfQ2["kilometer"] = dfQ2["kilometer"].astype('float')
dfQ2["notRepairedDamage"] = dfQ2["notRepairedDamage"].astype('int')
In [12]:
# Change 'notRepairedDamage' to category that only have 0 or 1.
dfQ2=dfQ2[(dfQ2['notRepairedDamage'] == 0) | (dfQ2['notRepairedDamage'] == 1)]
In [13]:
# Drop abnormal value in power, kilometer and price.
dfQ2=dfQ2.drop(dfQ2[dfQ2['power']==0].index)
dfQ2=dfQ2.drop(dfQ2[dfQ2['kilometer']==0].index)
dfQ2=dfQ2.drop(dfQ2[dfQ2['price']==float].index)
In [14]:
# Drop outliers
dfQ2['power'] = dfQ2['power'][dfQ2['power'].between(dfQ2['power'].quantile(.025), dfQ2['power'].quantile(.975))]
dfQ2['kilometer'] = dfQ2['kilometer'][dfQ2['kilometer'].between(dfQ2['kilometer'].quantile(.025), dfQ2['kilometer'].quantile(.975))]
dfQ2['price'] = dfQ2['price'][dfQ2['price'].between(dfQ2['price'].quantile(.025), dfQ2['price'].quantile(.975))]
dfQ2=dfQ2.dropna()
In [15]:
plt.hist(dfQ2['kilometer'], bins='auto')
plt.xlabel("Kilometer")
plt.ylabel("Frequency")
plt.title("Kilometer Histogram")
plt.show()
In [16]:
# Kilometer issue
#
# We think this variable's stratification might come from a mutiple choice question.
# And the last choice might be the kilometer that larger than 15km, so we have so many "15km" at here.
# In order to continue our analysis, we choose to drop most of cars' "kilometer" equal to 15.
dfQ2=dfQ2.drop((dfQ2[dfQ2['kilometer']==15]).sample(frac=0.7,random_state=1).index)

dfChkBasics(dfQ2)
dataframe Basic Check function -

1: info(): 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7667 entries, 82758 to 94147
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   SaleID             7667 non-null   int64  
 1   power              7667 non-null   float64
 2   kilometer          7667 non-null   float64
 3   notRepairedDamage  7667 non-null   int32  
 4   price              7667 non-null   float64
dtypes: float64(3), int32(1), int64(1)
memory usage: 329.4 KB
None

2: describe(): 
              SaleID        power    kilometer  notRepairedDamage  \
count    7667.000000  7667.000000  7667.000000        7667.000000   
mean    76090.443068   123.710056    10.722969           0.073692   
std     43184.208627    48.864183     4.041262           0.261287   
min         7.000000    52.000000     2.000000           0.000000   
25%     38970.500000    84.000000     8.000000           0.000000   
50%     76574.000000   116.000000    12.500000           0.000000   
75%    113826.000000   150.000000    15.000000           0.000000   
max    149996.000000   299.000000    15.000000           1.000000   

              price  
count   7667.000000  
mean    7366.548846  
std     6258.670361  
min      350.000000  
25%     2499.500000  
50%     5500.000000  
75%    10500.000000  
max    28000.000000  

3: head() -- 
        SaleID  power  kilometer  notRepairedDamage    price
82758    82758   67.0       15.0                  0   1800.0
89329    89329  140.0        9.0                  0   6500.0
90675    90675   74.0       15.0                  0   1990.0
137169  137169   75.0        8.0                  0   3100.0
136726  136726  140.0        2.0                  0  13500.0

4: shape: 
(7667, 5)
In [17]:
# Q2.1.2 Normality Plot
# QQPlot    
stats.probplot(dfQ2['power'], dist="norm", plot=py)
py.show()
stats.probplot(dfQ2['kilometer'], dist="norm", plot=py)
py.show()
In [18]:
# Histogram
plt.hist(dfQ2['power'], bins='auto')
plt.xlabel("Power")
plt.ylabel("Frequency")
plt.title("Power Histogram")
plt.show()

plt.hist(dfQ2['kilometer'], bins='auto')
plt.xlabel("Kilometer")
plt.ylabel("Frequency")
plt.title("Kilometer Histogram")
plt.show()
In [19]:
# Boxplot
dfQ2['power'].plot(kind='box')
plt.show()
dfQ2['kilometer'].plot(kind='box')
plt.show()
In [20]:
# Q2.2 Visualization
# Power vs Damage
# Before, we assume the "power" is the power of after using.
fuzzykilo = dfQ2['kilometer'] + np.random.normal(0,0.75, size=len(dfQ2['kilometer']))
fuzzypower = dfQ2['power'] + np.random.normal(0,3.5, size=len(dfQ2['notRepairedDamage']))
plt.plot(fuzzykilo,fuzzypower, 'o', markersize=8 ,alpha = 0.1)
plt.xlabel("Kilometer")
plt.ylabel("Power")
plt.title("Kilometer vs Power")
plt.show()
In [21]:
# Price vs Damage
fuzzykilo = dfQ2['notRepairedDamage'] + np.random.normal(0,0.4, size=len(dfQ2['notRepairedDamage']))
plt.plot(fuzzykilo,dfQ2['price'], 'o', markersize=8 ,alpha = 0.1)
plt.xlabel("Damge")
plt.ylabel("Price")
plt.title("Price vs Damage")
plt.show()
In [22]:
# Price vs Power
plt.plot(fuzzypower,dfQ2['price'], 'o', markersize=8 ,alpha = 0.1)
plt.xlabel("Power")
plt.ylabel("Price")
plt.title("Price vs Power")
plt.show()
In [23]:
# 3-dimention
sns.lmplot('power', 'price', data=dfQ2, hue="notRepairedDamage", x_jitter=3.5, scatter_kws={'alpha': 0.3, 's': 40 } )
plt.xlabel("Power")
plt.ylabel("Price")
plt.title("Price vs Power")
sns.despine()
In [24]:
# Q2.3 Model 
#
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from statsmodels.formula.api import glm
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import linear_model 
In [25]:
# Train-Test Split
xdfQ2 = dfQ2[['power', 'kilometer','notRepairedDamage']]
print(xdfQ2.head())
ydfQ2 = dfQ2['price']
print(ydfQ2.head())

x_train, x_test, y_train, y_test = train_test_split(xdfQ2, ydfQ2, test_size = 0.25, random_state=2020)
        power  kilometer  notRepairedDamage
82758    67.0       15.0                  0
89329   140.0        9.0                  0
90675    74.0       15.0                  0
137169   75.0        8.0                  0
136726  140.0        2.0                  0
82758      1800.0
89329      6500.0
90675      1990.0
137169     3100.0
136726    13500.0
Name: price, dtype: float64
In [26]:
#
# Prediction Model
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

clf1 = SVC()
clf2 = SVC(kernel="linear")
clf3 = LinearSVC()
clf4 = LogisticRegression()
clf5 = DecisionTreeClassifier()
clf6 = KNeighborsClassifier(n_neighbors=3)
clf7 = linear_model.LinearRegression() 
classifiers = [clf1,clf2,clf3,clf4,clf5,clf6,clf7]
#classifiers = [clf7]  
In [27]:
for c in classifiers:
    c.fit(x_train,y_train)
    print('\n%s\n'%(c))
    print(f'train score:  {c.score(x_train,y_train)}')
    print(f'test score:  {c.score(x_test,y_test)}')
SVC()

train score:  0.016
test score:  0.014084507042253521

SVC(kernel='linear')

train score:  0.029391304347826087
test score:  0.014606155451225874

LinearSVC()

train score:  0.001391304347826087
test score:  0.001564945226917058

LogisticRegression()

train score:  0.01791304347826087
test score:  0.017214397496087636

DecisionTreeClassifier()

train score:  0.23182608695652174
test score:  0.01773604590505999

KNeighborsClassifier(n_neighbors=3)

train score:  0.1542608695652174
test score:  0.013041210224308816

LinearRegression()

train score:  0.5665805054861399
test score:  0.5744119574407569
In [28]:
for c in classifiers:
  print('\n%s\n'%(c))
  print(cross_val_score(c, x_test, y_test, cv= 10))
  print(f'CV mean:  {np.mean(cross_val_score(c, x_test, y_test, cv= 10))}')
SVC()

[0.00520833 0.015625   0.01041667 0.01041667 0.00520833 0.015625
 0.00520833 0.0104712  0.01570681 0.01570681]
CV mean:  0.010959315008726003

SVC(kernel='linear')

[0.015625   0.015625   0.03125    0.015625   0.015625   0.02604167
 0.015625   0.0104712  0.0052356  0.02094241]
CV mean:  0.017206588132635252

LinearSVC()

[0.         0.         0.         0.         0.         0.00520833
 0.00520833 0.         0.         0.        ]
CV mean:  0.005740074171029667

LogisticRegression()

[0.00520833 0.02083333 0.03125    0.00520833 0.00520833 0.015625
 0.015625   0.0104712  0.01570681 0.0104712 ]
CV mean:  0.01356075479930192

DecisionTreeClassifier()

[0.01041667 0.00520833 0.03125    0.00520833 0.01041667 0.01041667
 0.00520833 0.0052356  0.0052356  0.0104712 ]
CV mean:  0.009906740837696337

KNeighborsClassifier(n_neighbors=3)

[0.00520833 0.00520833 0.02083333 0.00520833 0.015625   0.01041667
 0.00520833 0.0052356  0.0104712  0.0104712 ]
CV mean:  0.00938863438045375

LinearRegression()

[0.52748413 0.5026023  0.5814749  0.5239129  0.49056247 0.57675466
 0.63202999 0.61932591 0.60695404 0.62903229]
CV mean:  0.5690133593690981
In [29]:
#
classifiers = [clf6]
# ROC-AUC (Canceled)

#for c in classifiers:
  model = c.fit(x_train, y_train)
  y_predict_proba = model.predict_proba(x_test)
  ns_probs = [0 for _ in range(len(y_test))]
  y_predict_proba = y_predict_proba[:, 1]
  ns_auc = roc_auc_score(y_test, ns_probs)
  lr_auc = roc_auc_score(y_test, y_predict_proba)
  print('No Skill: ROC AUC=%.3f' % (ns_auc))
  print('Logistic: ROC AUC=%.3f' % (lr_auc))
  ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
  lr_fpr, lr_tpr, _ = roc_curve(y_test, y_predict_proba)
  plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
  plt.plot(lr_fpr, lr_tpr, marker='.', label='\n%s\n'%(c))
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.legend()
  plt.show()
  File "<ipython-input-29-26e71530ccb8>", line 6
    model = c.fit(x_train, y_train)
    ^
IndentationError: unexpected indent
In [30]:
from statsmodels.formula.api import ols

lm = ols(formula='price ~ power + kilometer + C(notRepairedDamage)', data=dfQ2).fit()
print( lm.summary() )
np.mean(lm.predict(dfQ2))
#print(modelpredicitons.head())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.569
Model:                            OLS   Adj. R-squared:                  0.568
Method:                 Least Squares   F-statistic:                     3368.
Date:                Fri, 04 Dec 2020   Prob (F-statistic):               0.00
Time:                        15:11:09   Log-Likelihood:                -74678.
No. Observations:                7667   AIC:                         1.494e+05
Df Residuals:                    7663   BIC:                         1.494e+05
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                  5866.9829    173.273     33.860      0.000    5527.320    6206.646
C(notRepairedDamage)[T.1] -2648.0888    180.524    -14.669      0.000   -3001.965   -2294.213
power                        76.8403      0.965     79.659      0.000      74.949      78.731
kilometer                  -728.4559     11.687    -62.331      0.000    -751.366    -705.546
==============================================================================
Omnibus:                      455.756   Durbin-Watson:                   1.988
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              859.158
Skew:                           0.437   Prob(JB):                    2.73e-187
Kurtosis:                       4.387   Cond. No.                         519.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[30]:
7366.548845702284
In [31]:
# Q2.2 Visualization
# Power vs Damage
# Before, we assume the "power" is the power of after using.
fuzzykilo = dfQ2['kilometer'] + np.random.normal(0,0.75, size=len(dfQ2['kilometer']))
fuzzypower = dfQ2['power'] + np.random.normal(0,3.5, size=len(dfQ2['notRepairedDamage']))
plt.plot(fuzzykilo,fuzzypower, 'o', markersize=8 ,alpha = 0.1)
z = np.polyfit(fuzzykilo,fuzzypower, 1)
p = np.poly1d(z)
plt.plot(fuzzykilo,p(fuzzykilo),"r--")
plt.xlabel("Kilometer")
plt.ylabel("Power")
plt.title("Kilometer vs Power")
plt.show()
In [32]:
sns.lmplot('kilometer', 'power', data=dfQ2, y_jitter=0.75, x_jitter=3.5, scatter_kws={'alpha': 0.3, 's': 40 } )
Out[32]:
<seaborn.axisgrid.FacetGrid at 0x1df5d2cbc10>
In [33]:
sns.regplot('kilometer', 'power', data=dfQ2, y_jitter=0.75, x_jitter=3.5, scatter_kws={'alpha': 0.3, 's': 40 } )
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1df5cbba4c0>
In [34]:
# 3-dimention
sns.lmplot('power', 'price', data=dfQ2, hue="notRepairedDamage", x_jitter=3.5, scatter_kws={'alpha': 0.3, 's': 40 } )
plt.xlabel("Power")
plt.ylabel("Price")
plt.title("Price vs Power")
sns.despine()
In [35]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif = pd.DataFrame()
vif["variables"] = dfQ2.columns
vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

print(vif)
           variables        VIF
0             SaleID   3.414661
1              power  13.186055
2          kilometer   7.043126
3  notRepairedDamage   1.109489
4              price   4.940568
In [36]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
dfQ2=dfQ2[:,1:4]
vif = pd.DataFrame()
vif["variables"] = dfQ2.columns
vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

print(vif)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
f:\Github\6103-Final-Project\UsedCar_Code.py in <module>
      1 from statsmodels.stats.outliers_influence import variance_inflation_factor
----> 2 dfQ2=dfQ2[:,1:4]
      3 vif = pd.DataFrame()
      4 vif["variables"] = dfQ2.columns
      5 vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

F:\Anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

F:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2644                 )
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:
   2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), slice(1, 4, None))' is an invalid key
In [37]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
dfQ2=dfQ2[:,[1,2,3,4]]
vif = pd.DataFrame()
vif["variables"] = dfQ2.columns
vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

print(vif)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
f:\Github\6103-Final-Project\UsedCar_Code.py in <module>
      1 from statsmodels.stats.outliers_influence import variance_inflation_factor
----> 2 dfQ2=dfQ2[:,[1,2,3,4]]
      3 vif = pd.DataFrame()
      4 vif["variables"] = dfQ2.columns
      5 vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

F:\Anaconda\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
   2798             if self.columns.nlevels > 1:
   2799                 return self._getitem_multilevel(key)
-> 2800             indexer = self.columns.get_loc(key)
   2801             if is_integer(indexer):
   2802                 indexer = [indexer]

F:\Anaconda\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
   2644                 )
   2645             try:
-> 2646                 return self._engine.get_loc(key)
   2647             except KeyError:
   2648                 return self._engine.get_loc(self._maybe_cast_indexer(key))

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()

TypeError: '(slice(None, None, None), [1, 2, 3, 4])' is an invalid key
In [38]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
dfQ2=dfQ2.iloc[:,[1,2,3,4]]
vif = pd.DataFrame()
vif["variables"] = dfQ2.columns
vif["VIF"] = [ variance_inflation_factor(dfQ2.values, i) for i in range(dfQ2.shape[1]) ]

print(vif)
           variables        VIF
0              power  13.112984
1          kilometer   5.983703
2  notRepairedDamage   1.106669
3              price   4.810876
In [39]:
lm = ols(formula='price ~ + kilometer + C(notRepairedDamage)', data=dfQ2).fit()
print( lm.summary() )
np.mean(lm.predict(dfQ2))
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  price   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     1028.
Date:                Fri, 04 Dec 2020   Prob (F-statistic):               0.00
Time:                        16:31:16   Log-Likelihood:                -76990.
No. Observations:                7667   AIC:                         1.540e+05
Df Residuals:                    7664   BIC:                         1.540e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=============================================================================================
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                   1.47e+04    180.068     81.614      0.000    1.43e+04     1.5e+04
C(notRepairedDamage)[T.1] -3408.4966    243.722    -13.985      0.000   -3886.259   -2930.734
kilometer                  -660.1025     15.758    -41.890      0.000    -690.992    -629.213
==============================================================================
Omnibus:                     1087.220   Durbin-Watson:                   2.011
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             1635.551
Skew:                           1.031   Prob(JB):                         0.00
Kurtosis:                       3.932   Cond. No.                         44.2
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Out[39]:
7366.548845702534